# Data preparation file
# Santiago Lopez-Cariboni, 2022. "Political Regimes and Informal Social Insurance", Comparative Political Studies.

# ==== wd ====
wd <- '~/Dropbox/Research/Papers/ElectricityTheft/PoliticalRegimesLosses/replication_materials/data' # adjust accordingly
setwd(wd)

# ==== packages ====
pkgs <- c("data.table", "dplyr", "lubridate", "readstata13", "neverhpfilter", "mFilter", 'countrycode',"democracyData", "WDI")
suppressPackageStartupMessages(sapply(pkgs,require,character.only=T))
	


# ==== datasets ====
#--  Create panel from the "countrycode" package
panel <- codelist_panel 
setDT(panel)

# ---- Wold Bank data ----
# new_wdi_cache <- WDIcache() 
# x <- c(
# 	gdp.pcap        = 'NY.GDP.PCAP.KD', # GDP per capita (constant 2010 US$)
# 	gdp.pcap.growth = 'NY.GDP.PCAP.KD.ZG', # GDP per capita growth (annual %)
# 	gdp.growth      = 'NY.GDP.MKTP.KD.ZG', # GDP growth (annual %)
#   gdp.currusd     = 'NY.GDP.MKTP.CD', # # GDP, current USD, millions
# 	dependency      = 'AG.LND.TOTL.K2', # dependency-age population as share of total pop
# 	land            = 'SP.POP.DPND', # total land in square kilometers
# 	pop             = 'SP.POP.TOTL', # total population
# 	pop.density     = 'EN.POP.DNST', # population density
# 	ind.emply       = 'SL.IND.EMPL.ZS', # employment in industry as share of total employment
# 	ind.gdp         = 'NV.IND.TOTL.ZS', # value added in industry as share of GDP
# 	imports         = 'NE.IMP.GNFS.ZS', # imports of goods and services as share of gdp
# 	exports         = 'NE.EXP.GNFS.ZS', # exports of goods and services as share of gdp
# 	trade           = 'NE.TRD.GNFS.ZS', # total trade as share of gdp
# 	fdi             = 'BX.KLT.DINV.WD.GD.ZS', # foreign-direct investment as share of
# 	tdlosses        = 'EG.ELC.LOSS.ZS', # Electric power transmission and distribution losses (% of output)
# 	elec.access     = 'EG.ELC.ACCS.ZS', # Access to electricity (% of population)
# 	elec.output     = '4.1.1_TOTAL.ELECTRICITY.OUTPUT', # Total electricity output (GWh)
# 	ppi.energy      = 'IE.PPI.ENGY.CD', # Investment in energy with private participation (current US$)
# 	elec.cons.pc    = 'EG.USE.ELEC.KH.PC', # Electric power consumption (kWh per capita)
# 	co2             = 'EN.ATM.CO2E.KT', # CO2 emissions (kt)
# 	govcons.gdp	    = 'NE.CON.GOVT.ZS', # Gov consumption gdp 
# 	exp.edu.currusd = 'NY.ADJ.AEDU.CD', # Adjusted savings: education expenditure (current US$)
# 	tax.gdp         = 'GC.TAX.TOTL.GD.ZS' # Tax revenue (% of GDP)

# )
# dat <- WDI(indicator = x, extra = TRUE, start = 1960, end = 2020)
# setDT(dat)
# dat <- dat[region != "Aggregates", ]
# write.csv(dat, 'wdi.dat.csv', row.names=FALSE)
dat <- fread('wdi.dat.csv')

#-- DPI --
dpi <- fread('Database_of_Political_Institutions_2017.csv')

# ---- Regime type data from: democracyData, by Xavier Marquez at: https://xmarquez.github.io/democracyData/index.html ----
# remotes::install_github("xmarquez/democracyData")

# -- BMR: The Boix-Miller-Rosato dichotomous coding of democracy, 1800-2015, version 3.0 --
# bmr <- bmr
# setDT(bmr)
# write.csv(bmr, 'bmr.csv', row.names=FALSE)
bmr <- fread('bmr.csv')

## -- PolityIV: The Polity IV dataset --
# polityIV <-  polityIV
# setDT(polityIV)
# write.csv(polityIV, 'polityIV.csv', row.names=FALSE)
polityIV <- fread('polityIV.csv')


## -- GWF: The Geddes Wright and Frantz Autocratic Regimes dataset --
# Geddes B, Wright J, Frantz E (2014). “Autocratic Breakdown and Regime Transitions: A New Data Set.” Perspectives on Politics, 12(1), 313-331. doi: 10.1017/S1537592714000851.
# gwf <- redownload_gwf(verbose = FALSE)
# table(gwf$gwf_regimetype, gwf$gwf_party)
# write.csv(gwf, 'gwf.csv', row.names=FALSE)
gwf <- fread('gwf.csv')

## -- GWF, personalism --
personal <- fread('GWF+personalism-scores_unique.csv')


## -- Time horizons --
bak_moon_fail <- fread("bak_moon_fail.csv")
bak_moon_fail <- merge(bak_moon_fail, panel[, .(year, cown, iso3c)], by.x=c("year", "ccode"), by.y=c("year", "cown"), all.x=TRUE)

## -- GWF, Pr(gwf_fail) --
fail <- fread('fail.csv')

# -- State capacity data --
# Hanson, Jonathan K. and Rachel Sigman. 2021. “Leviathan’s Latent Dimensions: Measuring State Capacity for Comparative Political Research.” The Journal of Politics 83(4):1495–1510.
stcap <- read.dta13("StateCapacityDataset_v1.dta")
setDT(stcap)

# -- V-dem dateset -- 
vdem  <- fread("V-Dem-CY-Full+Others-v12.csv")

# -- Privatization Data --
# Power Sector Reform Tracker (PSRT)
# Urpelainen, Johannes and Joonseok Yang. Forthcoming. "Global Patterns of Power Sector Reform, 1982-2013." Energy Strategy Reviews.
psrt_db <- read.dta13("PSRT_V1_Database.dta")
setDT(psrt_db)

# -- Eelectricity Access Data --
gedb <- read.dta13("Electrification_Database.dta")
setDT(gedb)

# -- Electricity Prices Data --
# IEA
iea.prices <- fread("IEA_WEP_SE_18042022182723681.csv")

# -- Variables affecting electricity prices -- 

# Additional data from WDI
wdi.dat_energy  <- fread("wdi.dat_energy.csv")

# Statistical Review of World Energy
crude  <- fread("crude_prices_2020USD_Statistical Review of World Energy.csv")


# ==== merging datasets ====

dt <- merge(panel, bmr[!is.na(cown),], by.x = c("cown", "year"), by.y=c("cown", "year"), all.x=TRUE)
dt <- merge(dt, polityIV[!is.na(cown),], by.x = c("cown", "year"), by.y=c("cown", "year"), all.x=TRUE)
dt <- merge(dt, gwf[!is.na(cown),], by.x = c("cown", "year"), by.y=c("cown", "year"), all.x=TRUE)
dt <- merge(dt, stcap[!is.na(iso3),], by.x = c("iso3c", "year"), by.y=c("iso3", "year"), all.x=TRUE)
dt <- merge(dt, psrt_db[,.(cntry, year, r_prv, r_cos, r_cor,r_reg)], by.x = c("country.name.en", "year"), by.y=c("cntry", "year"), all.x=TRUE)
dt <- merge(dt, gedb[,], by.x = c("country.name.en", "year"), by.y=c("countryname", "year"), all.x=TRUE)
dt <- merge(dt, iea.prices, by.x = c("country.name.en", "year"), by.y=c("Country", "Time"), all.x=TRUE)
dt <- merge(dt, vdem, by.x = c("vdem", "year"), by.y=c("country_id", "year"), all.x=TRUE)
dt <- merge(dt, fail, by=c("year", "iso3c"), all.x=TRUE)
dt <- merge(dt, bak_moon_fail, by.x=c("year", "iso3c"), by.y=c("year", "iso3c"), all.x=TRUE)
dt <- merge(dt, personal, by.x=c("year", "cown"), by.y=c("year", "cowcode"), all.x=TRUE)
dt <- merge(dt, dpi, by.x=c("year", "iso3c"), by.y=c("year", "iso3c"), all.x=TRUE)
dt <- merge(dt, dat, by=c("iso3c", "year"), all.y=TRUE)
dt <- dt[!is.na(iso3c), ]
dt$unique_id <- paste(dt$iso3c,dt$year) # concatenate to make unique ID
dt$duplicate = duplicated(dt$unique_id) # generate the duplicate variable
subset(dt, duplicate=="TRUE") # find the duplicate
dt <- dt[duplicate==FALSE, ]
dt <- merge(dt, wdi.dat_energy, by=c("iso3c", "year"), all.x=TRUE)
dt <- merge(dt, crude, by.x=c("year"), by.y=c("Year"), all.x=TRUE)

# keep 1960 onwards
dt <- dt[year>=1960, ]


# ==== Covariates ====

# -- OECD dummy
source("../code/functions/ctry.codes.R")
dt$isoname <- ctry.codes(ocode = "iso3c" , dcode ="isoname" , dt$iso3c)
dt$developed <- 0
dt$developed <- ifelse(dt$isoname=="Australia", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Austria", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Belgium", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Belgium-Luxembourg", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Luxembourg", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Canada", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Denmark", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Finland", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="France", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Germany", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="German Democratic Republic", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Greece", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Iceland", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Ireland", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Italy", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Japan", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Netherlands", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="New Zealand", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Norway", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Portugal", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Spain", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Sweden", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="Switzerland", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="United Kingdom", 1, dt$developed) 
dt$developed <- ifelse(dt$isoname=="United States", 1, dt$developed) 
dt$developed <- ifelse(is.na(dt$developed), 0, dt$developed) 
dt$LDC <- 0
dt$LDC <- ifelse(dt$developed==0, 1, dt$LDC)

# keep 1940 onwards
dt <- dt[year>=1960, ]


# -- Filter GDP and electricity losses to obtain the cyclical compoment. 

for (i in 1:length(unique(dt$iso3c))) {
tryCatch({
	ts.gdp <- as.xts(dt[iso3c == unique(dt$iso3c)[i],.(year = ymd(year, truncated = 2L), log(gdp.pcap)*100)])
	trend_cycle <- yth_filter(ts.gdp, h = 4, p = 2, output = c("x", "trend", "cycle"))
	dt[iso3c == unique(dt$iso3c)[i], c("hamilton.trend.gdp", "hamilton.cycle.gdp"):= list(as.numeric(trend_cycle[,2]),as.numeric(trend_cycle[,3]))]
	dt[iso3c == unique(dt$iso3c)[i], outgap.gdp.hamilton := (log(gdp.pcap)*100-hamilton.trend.gdp)/hamilton.trend.gdp*100]
}, error=function(e){})
tryCatch({
	ts.tdl <- as.xts(dt[iso3c == unique(dt$iso3c)[i],.(year = ymd(year, truncated = 2L), tdlosses)])
	trend_cycle <- yth_filter(ts.tdl, h = 4, p = 2, output = c("x", "trend", "cycle"))
	dt[iso3c == unique(dt$iso3c)[i], c("trend.tdl", "cycle.tdl"):= list(as.numeric(trend_cycle[,2]),as.numeric(trend_cycle[,3]))]
	dt[iso3c == unique(dt$iso3c)[i], outgap.tdl := (tdlosses-trend.tdl)/trend.tdl]
}, error=function(e){})
}


# -- Spatial Lags for instrumental variables
#country codes for Comtrade data
source("../code/functions/ctry.codes.R")
ctrycodes.data <- fread("ctrycodes.data.csv")
#spatial lag function for trade connectivity matrices 
source('../code/functions/spatial.lag2.r')
#spatial lag function for regional diffusion 
source('../code/functions/region.lag.R')
# keep developed countries separately
dt_dev <- subset(dt, LDC==0)

sitcorder  <- 1:240
year  <- 1960:2017
panel <- as.data.frame(cbind( 
	sitcorder= rep(sitcorder, length(year)),
	year = rep(year, each=length(sitcorder)) ))
panel$iso3c <- ctry.codes("sitcorder", "iso3c", panel$sitcorder)
names(panel)
dt <- merge(panel, subset(dt, LDC==1), by=c("iso3c", "year"), all.x =TRUE)

# subset to the relevant period and sort the data
dt <- subset(dt, year>=1970 & year<=2017)
dt <- dt[order(dt$year, dt$sitcorder),]	

# arguments for the "spatial.lag" function
period 		<- 1970:2015
path  		<-  paste('bilateral_trade/', sep='')
filenames 	<- 'bilateral_trade'
countrycode <- dt$iso3c
dt$region <- ifelse(is.na(dt$region.y), "", dt$region.y)
region		<- dt$region
dt$region23 <- ifelse(is.na(dt$region23), "", dt$region23)

# Generate spatial lags.
dt$lngdppc <- log(dt$gdp.pcap)*100
dt$w.lngdppc  <- spatial.lag(period, path, filenames, data=dt, yvar='lngdppc')
dt$w.gdp.growth  <- spatial.lag(period, path, filenames, data=dt, yvar='gdp.growth')
dt$w.gdp.pcap.growth  <- spatial.lag(period, path, filenames, data=dt, yvar='gdp.pcap.growth')
dt$w.outgap.gdp.hamilton  <- spatial.lag(period, path, filenames, data=dt, yvar='outgap.gdp.hamilton')
dt$w.hamilton.cycle.gdp  <- spatial.lag(period, path, filenames, data=dt, yvar='hamilton.cycle.gdp')
dt$w.outgap.tdl  <- spatial.lag(period, path, filenames, data=dt, yvar='outgap.tdl')
dt$wreg.democracy  <- region.lag(period, countrycode, region, data=dt, yvar='democracy')
dt$wreg.polity2  <- region.lag(period, countrycode, region, data=dt, yvar='polity2')
dt$wreg.polity_dem  <- region.lag(period, countrycode, region, data=dt, yvar='polity_dem')
dt$wreg.outgap.tdl  <- region.lag(period, countrycode, region, data=dt, yvar='outgap.tdl')

# row bind developed countries data
dt <- bind_rows(subset(dt, !iso3c %in% unique(dt_dev$iso3c)), subset(dt_dev, year>=1970 & year<=2017))

# -- Time Lags
source('../code/functions/lagpanel.R')
dt <- dt[order(dt$iso3c, dt$year),]	# sort for lagpanel
dt$l.tdlosses <- lagpanel(dt$tdlosses, dt$iso3c, dt$year, 1) 
dt$l2.tdlosses <- lagpanel(dt$tdlosses, dt$iso3c, dt$year, 2) 
dt$l3.tdlosses <- lagpanel(dt$tdlosses, dt$iso3c, dt$year, 3) 

dt$l.w.outgap.tdl <- lagpanel(dt$w.outgap.tdl, dt$iso3c, dt$year, 1) 
dt$l2.w.outgap.tdl <- lagpanel(dt$w.outgap.tdl, dt$iso3c, dt$year, 2) 

dt$l.wreg.outgap.tdl <- lagpanel(dt$wreg.outgap.tdl, dt$iso3c, dt$year, 2) 

dt$l.outgap.tdl <- lagpanel(dt$outgap.tdl, dt$iso3c, dt$year, 1) 
dt$l2.outgap.tdl <- lagpanel(dt$outgap.tdl, dt$iso3c, dt$year, 2) 
dt$l.cycle.tdl <- lagpanel(dt$cycle.tdl, dt$iso3c, dt$year, 1) 

dt$l.outgap.gdp.hamilton <- lagpanel(dt$outgap.gdp.hamilton, dt$iso3c, dt$year, 1) 
dt$l.w.outgap.gdp.hamilton <- lagpanel(dt$w.outgap.gdp.hamilton, dt$iso3c, dt$year, 1) 
dt$l2.w.outgap.gdp.hamilton <- lagpanel(dt$w.outgap.gdp.hamilton, dt$iso3c, dt$year, 2) 

dt$l.hamilton.cycle.gdp <- lagpanel(dt$hamilton.cycle.gdp, dt$iso3c, dt$year, 1) 
dt$l.w.hamilton.cycle.gdp <- lagpanel(dt$w.hamilton.cycle.gdp, dt$iso3c, dt$year, 1) 
dt$l.w.gdpgrowth <- lagpanel(dt$w.gdp.growth, dt$iso3c, dt$year, 1) 

dt$l.democracy <- lagpanel(dt$democracy, dt$iso3c, dt$year, 1) 
dt$l.wreg.democracy <- lagpanel(dt$wreg.democracy, dt$iso3c, dt$year, 1) 
dt$l.polity_dem <- lagpanel(dt$polity_dem, dt$iso3c, dt$year, 1) 
dt$l.polity2 <- lagpanel(dt$polity2, dt$iso3c, dt$year, 1) 
dt$l.wreg.polity2 <- lagpanel(dt$wreg.polity2, dt$iso3c, dt$year, 1) 

dt$l.pop <- lagpanel(dt$pop, dt$iso3c, dt$year, 1) 
dt$l.pop.density <- lagpanel(dt$pop.density, dt$iso3c, dt$year, 1) 
dt$l.gdp.pcap <- lagpanel(dt$gdp.pcap, dt$iso3c, dt$year, 1) 
dt$l.imports <- lagpanel(dt$imports, dt$iso3c, dt$year, 1) 
dt$l.exports <- lagpanel(dt$exports, dt$iso3c, dt$year, 1) 
dt$l.capacity <- lagpanel(dt$Capacity, dt$iso3c, dt$year, 1) 

dt$l.elec.access <- lagpanel(dt$elec.access, dt$iso3c, dt$year, 1) 
dt$l.elec.output <- lagpanel(dt$elec.output, dt$iso3c, dt$year, 1) 
dt$l.elec.cons.pc <- lagpanel(dt$elec.cons.pc, dt$iso3c, dt$year, 1) 

dt$l.elec.access.rural <- lagpanel(dt$elec.access.rural, dt$iso3c, dt$year, 1) 
dt$l.elec.access.urban <- lagpanel(dt$elec.access.urban, dt$iso3c, dt$year, 1) 
dt$l.elec.access.total <- lagpanel(dt$elec.access.total, dt$iso3c, dt$year, 1) 

dt <- as.data.table(dt)
dt <- dt[year>=1970 & year<=2017,]
dt <- dt[, -"V1"]

vars <- c("iso3c", "year", "LDC", "developed", "region23", "outgap.tdl", "Capacity", "democracy", "elec.cons.pc", "imports", "exports", "gdp.pcap", "outgap.gdp.hamilton", "pop", "pop.density", "l.outgap.tdl", "l.capacity", "l.democracy", "l.elec.cons.pc", "l.imports", "l.exports", "l.gdp.pcap", "l.outgap.gdp.hamilton", "l.pop", "l.pop.density", "l.w.outgap.gdp.hamilton", "l.wreg.democracy", "gwf_duration", "polity2", "tdlosses", "l.tdlosses", "elec.access.total", "l.elec.access.total", "l.polity2", "l.wreg.polity2", "r_prv", "Residential_Total price (2015 USD/MWh using PPP)", "ele_production_oil", "ele_production_hydro", "ele_production_nuclear", "crude_price_barrel", "oilrents_gdp", "r_cor", "r_reg", "bti_mo", "milpercap", "v2x_pubcorr", "fail", "archfail", "archdura1", "e_civil_war", "finittrm", "yrcurnt")
dt <- dt[, ..vars]

write.csv(dt, "dt_replication.csv")








